# We load the libraries that will be used.

library(caTools)
library(corrplot)
## corrplot 0.92 loaded
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(factoextra)
## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(foreign)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(leaflet)
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(viridis)
## Loading required package: viridisLite
# We load our data set.

Houses <- read.arff("house_sales_reduced.arff")

Preprocessing

Visualization of the dataset

Houses[1:9,]
##   attribute_0         id   price bedrooms bathrooms sqft_living sqft_lot floors
## 1           0 7129300520  221900        3      1.00        1180     5650      1
## 2           1 6414100192  538000        3      2.25        2570     7242      2
## 3           2 5631500400  180000        2      1.00         770    10000      1
## 4           3 2487200875  604000        4      3.00        1960     5000      1
## 5           4 1954400510  510000        3      2.00        1680     8080      1
## 6           5 7237550310 1225000        4      4.50        5420   101930      1
## 7           6 1321400060  257500        3      2.25        1715     6819      2
## 8           7 2008000270  291850        3      1.50        1060     9711      1
## 9           8 2414600126  229500        3      1.00        1780     7470      1
##   waterfront view condition grade sqft_above sqft_basement yr_built
## 1          0    0         3     7       1180             0     1955
## 2          0    0         3     7       2170           400     1951
## 3          0    0         3     6        770             0     1933
## 4          0    0         5     7       1050           910     1965
## 5          0    0         3     8       1680             0     1987
## 6          0    0         3    11       3890          1530     2001
## 7          0    0         3     7       1715             0     1995
## 8          0    0         3     7       1060             0     1963
## 9          0    0         3     7       1050           730     1960
##   yr_renovated zipcode     lat     long sqft_living15 sqft_lot15
## 1            0   98178 47.5112 -122.257          1340       5650
## 2         1991   98125 47.7210 -122.319          1690       7639
## 3            0   98028 47.7379 -122.233          2720       8062
## 4            0   98136 47.5208 -122.393          1360       5000
## 5            0   98074 47.6168 -122.045          1800       7503
## 6            0   98053 47.6561 -122.005          4760     101930
## 7            0   98003 47.3097 -122.327          2238       6819
## 8            0   98198 47.4095 -122.315          1650       9711
## 9            0   98146 47.5123 -122.337          1780       8113

Inspection of the dataset

# Atributes names.

names(Houses)
##  [1] "attribute_0"   "id"            "price"         "bedrooms"     
##  [5] "bathrooms"     "sqft_living"   "sqft_lot"      "floors"       
##  [9] "waterfront"    "view"          "condition"     "grade"        
## [13] "sqft_above"    "sqft_basement" "yr_built"      "yr_renovated" 
## [17] "zipcode"       "lat"           "long"          "sqft_living15"
## [21] "sqft_lot15"
summary(Houses)
##   attribute_0          id                price            bedrooms     
##  Min.   :    0   Min.   :1.000e+06   Min.   :  75000   Min.   : 0.000  
##  1st Qu.: 5403   1st Qu.:2.123e+09   1st Qu.: 321950   1st Qu.: 3.000  
##  Median :10806   Median :3.905e+09   Median : 450000   Median : 3.000  
##  Mean   :10806   Mean   :4.580e+09   Mean   : 540088   Mean   : 3.371  
##  3rd Qu.:16209   3rd Qu.:7.309e+09   3rd Qu.: 645000   3rd Qu.: 4.000  
##  Max.   :21612   Max.   :9.900e+09   Max.   :7700000   Max.   :33.000  
##                                                                        
##    bathrooms      sqft_living       sqft_lot           floors     
##  Min.   :0.000   Min.   :  290   Min.   :    520   Min.   :1.000  
##  1st Qu.:1.750   1st Qu.: 1427   1st Qu.:   5040   1st Qu.:1.000  
##  Median :2.250   Median : 1910   Median :   7618   Median :1.500  
##  Mean   :2.115   Mean   : 2080   Mean   :  15107   Mean   :1.494  
##  3rd Qu.:2.500   3rd Qu.: 2550   3rd Qu.:  10688   3rd Qu.:2.000  
##  Max.   :8.000   Max.   :13540   Max.   :1651359   Max.   :3.500  
##                                                                   
##    waterfront            view          condition         grade       
##  Min.   :0.000000   Min.   :0.0000   Min.   :1.000   Min.   : 1.000  
##  1st Qu.:0.000000   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.: 7.000  
##  Median :0.000000   Median :0.0000   Median :3.000   Median : 7.000  
##  Mean   :0.007542   Mean   :0.2343   Mean   :3.409   Mean   : 7.657  
##  3rd Qu.:0.000000   3rd Qu.:0.0000   3rd Qu.:4.000   3rd Qu.: 8.000  
##  Max.   :1.000000   Max.   :4.0000   Max.   :5.000   Max.   :13.000  
##                                                                      
##    sqft_above   sqft_basement       yr_built     yr_renovated   
##  Min.   : 290   Min.   :   0.0   Min.   :1900   Min.   :   0.0  
##  1st Qu.:1190   1st Qu.:   0.0   1st Qu.:1951   1st Qu.:   0.0  
##  Median :1560   Median :   0.0   Median :1975   Median :   0.0  
##  Mean   :1788   Mean   : 291.5   Mean   :1971   Mean   :  84.4  
##  3rd Qu.:2210   3rd Qu.: 560.0   3rd Qu.:1997   3rd Qu.:   0.0  
##  Max.   :9410   Max.   :4820.0   Max.   :2015   Max.   :2015.0  
##                                                                 
##     zipcode           lat             long        sqft_living15 
##  Min.   :98001   Min.   :47.16   Min.   :-122.5   Min.   : 399  
##  1st Qu.:98033   1st Qu.:47.47   1st Qu.:-122.3   1st Qu.:1490  
##  Median :98065   Median :47.57   Median :-122.2   Median :1840  
##  Mean   :98078   Mean   :47.56   Mean   :-122.2   Mean   :1987  
##  3rd Qu.:98118   3rd Qu.:47.68   3rd Qu.:-122.1   3rd Qu.:2360  
##  Max.   :98199   Max.   :47.78   Max.   :-121.3   Max.   :6210  
##                                                                 
##    sqft_lot15   
##  5000   :  427  
##  4000   :  357  
##  6000   :  289  
##  7200   :  211  
##  4800   :  145  
##  7500   :  142  
##  (Other):20042

The following command allow us to see the data types of all the variables.

str(Houses)
## 'data.frame':    21613 obs. of  21 variables:
##  $ attribute_0  : num  0 1 2 3 4 5 6 7 8 9 ...
##  $ id           : num  7.13e+09 6.41e+09 5.63e+09 2.49e+09 1.95e+09 ...
##  $ price        : num  221900 538000 180000 604000 510000 ...
##  $ bedrooms     : num  3 3 2 4 3 4 3 3 3 3 ...
##  $ bathrooms    : num  1 2.25 1 3 2 4.5 2.25 1.5 1 2.5 ...
##  $ sqft_living  : num  1180 2570 770 1960 1680 ...
##  $ sqft_lot     : num  5650 7242 10000 5000 8080 ...
##  $ floors       : num  1 2 1 1 1 1 2 1 1 2 ...
##  $ waterfront   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ view         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ condition    : num  3 3 3 5 3 3 3 3 3 3 ...
##  $ grade        : num  7 7 6 7 8 11 7 7 7 7 ...
##  $ sqft_above   : num  1180 2170 770 1050 1680 ...
##  $ sqft_basement: num  0 400 0 910 0 1530 0 0 730 0 ...
##  $ yr_built     : num  1955 1951 1933 1965 1987 ...
##  $ yr_renovated : num  0 1991 0 0 0 ...
##  $ zipcode      : num  98178 98125 98028 98136 98074 ...
##  $ lat          : num  47.5 47.7 47.7 47.5 47.6 ...
##  $ long         : num  -122 -122 -122 -122 -122 ...
##  $ sqft_living15: num  1340 1690 2720 1360 1800 ...
##  $ sqft_lot15   : Factor w/ 8689 levels "10000","10001",..: 5533 6933 7277 5064 6813 143 6321 8506 7326 6873 ...

Treatment of mixed datatypes

# Converting 'sqft_lot15' to numerical since it has too many levels.

Houses$sqft_lot15 <- as.numeric(Houses$sqft_lot15)
# We convert the square feet into square meters using the fact that 1 ft^2 = 0.09290304 m^2 .
Houses[,"sqft_living"] <- Houses[,"sqft_living"]*0.09290304
Houses[,"sqft_lot"] <- Houses[,"sqft_lot"]*0.09290304
Houses[,"sqft_above"] <- Houses[,"sqft_above"]*0.09290304
Houses[,"sqft_basement"] <- Houses[,"sqft_basement"]*0.09290304
Houses[,"sqft_living15"] <- Houses[,"sqft_living15"]*0.09290304
Houses[,"sqft_lot15"] <- Houses[,"sqft_lot15"]*0.09290304
# We rename the columns to fit the new data.
names(Houses)[6] = "sqm_living"
names(Houses)[7] = "sqm_lot"
names(Houses)[13] = "sqm_above"
names(Houses)[14] = "sqm_basement"
names(Houses)[20] = "sqm_living15"
names(Houses)[21] = "sqm_lot15"
Houses[1:9,]
##   attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot floors
## 1           0 7129300520  221900        3      1.00  109.62559  524.9022      1
## 2           1 6414100192  538000        3      2.25  238.76081  672.8038      2
## 3           2 5631500400  180000        2      1.00   71.53534  929.0304      1
## 4           3 2487200875  604000        4      3.00  182.08996  464.5152      1
## 5           4 1954400510  510000        3      2.00  156.07711  750.6566      1
## 6           5 7237550310 1225000        4      4.50  503.53448 9469.6069      1
## 7           6 1321400060  257500        3      2.25  159.32871  633.5058      2
## 8           7 2008000270  291850        3      1.50   98.47722  902.1814      1
## 9           8 2414600126  229500        3      1.00  165.36741  693.9857      1
##   waterfront view condition grade sqm_above sqm_basement yr_built yr_renovated
## 1          0    0         3     7 109.62559      0.00000     1955            0
## 2          0    0         3     7 201.59960     37.16122     1951         1991
## 3          0    0         3     6  71.53534      0.00000     1933            0
## 4          0    0         5     7  97.54819     84.54177     1965            0
## 5          0    0         3     8 156.07711      0.00000     1987            0
## 6          0    0         3    11 361.39283    142.14165     2001            0
## 7          0    0         3     7 159.32871      0.00000     1995            0
## 8          0    0         3     7  98.47722      0.00000     1963            0
## 9          0    0         3     7  97.54819     67.81922     1960            0
##   zipcode     lat     long sqm_living15 sqm_lot15
## 1   98178 47.5112 -122.257     124.4901 514.03252
## 2   98125 47.7210 -122.319     157.0061 644.09678
## 3   98028 47.7379 -122.233     252.6963 676.05542
## 4   98136 47.5208 -122.393     126.3481 470.46099
## 5   98074 47.6168 -122.045     167.2255 632.94841
## 6   98053 47.6561 -122.005     442.2185  13.28513
## 7   98003 47.3097 -122.327     207.9170 587.24012
## 8   98198 47.4095 -122.315     153.2900 790.23326
## 9   98146 47.5123 -122.337     165.3674 680.60767

PCA

# Seleccionar només les variables que volguem

Houses_for_pca <- subset(Houses, select = - c(id, attribute_0))
(houses_pca <- prcomp(Houses_for_pca, scale = TRUE))
## Standard deviations (1, .., p=19):
##  [1] 2.377625e+00 1.546918e+00 1.244706e+00 1.147317e+00 1.039631e+00
##  [6] 1.010273e+00 9.495284e-01 9.349664e-01 9.111071e-01 8.097547e-01
## [11] 7.584549e-01 7.159767e-01 6.478718e-01 5.729237e-01 5.260024e-01
## [16] 4.865578e-01 4.396953e-01 4.273636e-01 5.447382e-15
## 
## Rotation (n x k) = (19 x 19):
##                      PC1          PC2          PC3         PC4          PC5
## price         0.30202058  0.298714782 -0.062212848  0.06000036 -0.136785565
## bedrooms      0.23533956  0.073943300  0.189362063 -0.35354908  0.188577222
## bathrooms     0.35110915 -0.002150944 -0.043928175 -0.13514676  0.120946417
## sqm_living    0.38814146  0.120537670  0.093271118 -0.11588189  0.016982002
## sqm_lot       0.08014006 -0.063374400  0.258215404  0.28661828 -0.407975745
## floors        0.22309796 -0.192523280 -0.443013596  0.03179641  0.079355713
## waterfront    0.06077447  0.216014438 -0.007122435  0.57701574  0.284898070
## view          0.12886277  0.326489057  0.046662605  0.43637029  0.195368204
## condition    -0.07043308  0.225965294  0.389845196 -0.12415168 -0.165847577
## grade         0.36289983  0.029757279 -0.113054440 -0.01685475 -0.051579213
## sqm_above     0.37677486 -0.076563224 -0.047912778 -0.01468944  0.008950074
## sqm_basement  0.10050529  0.393397783  0.283206544 -0.21299546  0.018495144
## yr_built      0.21790947 -0.387271025 -0.132456427  0.01911433  0.146108565
## yr_renovated  0.01316716  0.180586468 -0.063268118  0.18231485  0.097771897
## zipcode      -0.13397813  0.328118588 -0.410637939 -0.10887227  0.045462923
## lat           0.02390311  0.242691441 -0.346109652 -0.20530022 -0.462442091
## long          0.15403740 -0.365791949  0.301728258  0.14659501 -0.186010491
## sqm_living15  0.34553607  0.026336592  0.093280020  0.01162947 -0.073067123
## sqm_lot15    -0.07037990 -0.014521050  0.165866754 -0.25034212  0.566905104
##                       PC6         PC7           PC8          PC9         PC10
## price         0.009695952 -0.21367051  0.1037508173 -0.029122700  0.077076006
## bedrooms     -0.090844512  0.17374377  0.0027827083  0.131792452 -0.488036449
## bathrooms    -0.003599118  0.19470451 -0.0185767715  0.050324195 -0.149648307
## sqm_living   -0.042637934  0.01636435 -0.0686796186  0.021429639  0.046620494
## sqm_lot      -0.221778135 -0.05761469 -0.6901011758  0.220987518 -0.214265401
## floors        0.008034394  0.02418795  0.0677668951  0.346466779 -0.157871023
## waterfront    0.205618720 -0.10668336  0.1309115246 -0.070884097 -0.535694472
## view          0.184235038  0.01280516 -0.1026331737 -0.053622654  0.273963553
## condition     0.224567178 -0.19206214  0.3578302903  0.554092140 -0.051134976
## grade         0.046604662 -0.07912441 -0.0071584991  0.011267230  0.241064550
## sqm_above    -0.092964978 -0.20704132  0.0004714251  0.204420171  0.061289560
## sqm_basement  0.085461296  0.42134941 -0.1434074389 -0.338014184 -0.017929533
## yr_built      0.228514272  0.17375695 -0.0903969133 -0.163055098 -0.064162845
## yr_renovated -0.845284783  0.14123679  0.2770684326 -0.007254797  0.024504733
## zipcode       0.017876531  0.02170621 -0.3680044859  0.196122921 -0.005471683
## lat           0.014992231 -0.34465539  0.0844999967 -0.403496069 -0.339253378
## long         -0.076599089 -0.12671029  0.1692768957 -0.288222116 -0.142711651
## sqm_living15  0.017348783 -0.14407228  0.0251327356 -0.097506149  0.305497144
## sqm_lot15    -0.149546681 -0.63306433 -0.2649728095 -0.135416584 -0.010679946
##                     PC11         PC12         PC13         PC14        PC15
## price        -0.01587609  0.199658023 -0.228945055 -0.199217847  0.41171585
## bedrooms      0.44839278 -0.310191171 -0.156937967  0.225030211 -0.01246251
## bathrooms    -0.30772989 -0.010159655  0.033366864 -0.013304226  0.27002146
## sqm_living    0.06243708  0.111683436  0.106119099 -0.190961079  0.04132992
## sqm_lot      -0.13734523  0.039498459 -0.131007136  0.075390445 -0.07140419
## floors       -0.27104903 -0.190656878 -0.140394110 -0.459186559 -0.41968330
## waterfront    0.08854052  0.349083795  0.134678419  0.034728031 -0.11840269
## view         -0.04220131 -0.688186148 -0.162892352  0.064262871  0.07373766
## condition    -0.37300567 -0.071797057  0.189209918  0.185845752 -0.04880691
## grade        -0.07446169  0.187714319 -0.009057401  0.294651129  0.04167640
## sqm_above     0.22565459  0.057189271  0.082544309 -0.063895587  0.16439899
## sqm_basement -0.29264589  0.124762380  0.065773982 -0.276732968 -0.22183408
## yr_built     -0.36025961  0.006205142  0.118365642  0.498751681  0.08059578
## yr_renovated -0.20992972 -0.022383135  0.095927123  0.205094899 -0.04260926
## zipcode       0.09644417 -0.088213414  0.674699108  0.009301181  0.12401563
## lat          -0.12926000 -0.214842470 -0.091367471  0.168127636 -0.10220589
## long         -0.01197953 -0.320415440  0.499569177 -0.311332663  0.16827412
## sqm_living15  0.23573501  0.039213631  0.211128623  0.174341945 -0.63978760
## sqm_lot15    -0.24763507 -0.007870628 -0.032988606 -0.051413265 -0.05374007
##                      PC16        PC17         PC18          PC19
## price        -0.217386590  0.34834557 -0.509646570 -7.155817e-15
## bedrooms     -0.235558002  0.12918510 -0.062134741  2.217223e-16
## bathrooms     0.619233485  0.39145906  0.270198414 -1.021713e-15
## sqm_living    0.081766178 -0.49125560 -0.013229746 -6.992604e-01
## sqm_lot      -0.007370041  0.04536237 -0.017916288 -1.703924e-16
## floors       -0.200728392  0.03360977 -0.038108423 -1.219782e-15
## waterfront    0.019860673 -0.02318549  0.093239664 -1.205997e-16
## view          0.039615475 -0.09325626  0.041406403 -6.911249e-17
## condition    -0.020655213 -0.06040049 -0.054797757 -1.257580e-16
## grade        -0.510423001  0.11105408  0.618615656  4.878886e-16
## sqm_above     0.181893210 -0.47937525 -0.006321435  6.304719e-01
## sqm_basement -0.170652924 -0.12251688 -0.015626765  3.369571e-01
## yr_built     -0.121550571 -0.17624790 -0.443926175  1.954320e-16
## yr_renovated -0.043489137 -0.05509836 -0.077200491 -8.122276e-17
## zipcode      -0.099050163  0.11309314 -0.082237251 -3.321866e-17
## lat           0.129513263 -0.16863937  0.074452279  5.914339e-17
## long         -0.200651253  0.13422056  0.073643621  1.348441e-16
## sqm_living15  0.237402468  0.31489957 -0.210431995 -2.006462e-16
## sqm_lot15    -0.042100870  0.04092750  0.004723152 -1.195691e-16
fviz_pca_biplot(houses_pca, repel = TRUE, axes = c(1,2), col.ind = "#CDC5BF", col.var = "#7A67EE")
## Warning: ggrepel: 21612 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

fviz_eig(houses_pca, ylim = c(1, 100), addlabels = TRUE, barcolor = 1, barfill = "darkorange2", 
         main = "Scree Plot of nyc_PCA")

Data visualization

# The houses are placed in a map in order to properly visualize the dataset. 

# We arbitrarily separate the houses in three distinct categories. 

high_calif <- Houses$grade >= 9 
med_calif <- (Houses$grade < 9 & Houses$grade > 4)
low_calif <- Houses$grade <= 4
Houses_HG <- subset(Houses, high_calif == TRUE) # High grade
Houses_MG <- subset(Houses, med_calif == TRUE) # Medium grade
Houses_LG <- subset(Houses, low_calif == FALSE) # Low grade

set_points <- data.frame( lat = c(min(Houses$lat) - 0.07, max(Houses$lat) + 0.07, max(Houses$lat) + 0.07,
                                  min(Houses$lat) - 0.07, min(Houses$lat) - 0.07),
                          long = c(min(Houses$long) - 0.07, min(Houses$long) - 0.07, max(Houses$long) + 0.07,
                                   max(Houses$long) + 0.07, min(Houses$long) - 0.07)) # Added extra lat and long in order to box observations better



m <- leaflet() %>%
  addTiles() %>%
 addCircleMarkers(data = Houses_LG, lng = Houses_LG$long, lat = Houses_LG$lat, popup = "Seattle",
                   radius = 1, color = "#FF1493", stroke = F, opacity = 0.5) %>%
  addCircleMarkers(data = Houses_MG, lng = Houses_MG$long, lat = Houses_MG$lat, popup = "Seattle",
                   radius = 0.5, color = "black", stroke = F, opacity = 2) %>%
  addCircleMarkers(data = Houses_HG, lng = Houses_HG$long, lat = Houses_HG$lat, popup = "Seattle",
                   radius = 0.5, color = "#00BFFF", stroke = F, opacity = 5) %>%
  addPolylines(data = set_points, lng = ~long, lat = ~lat, weight = 3,
              opacity = 6, col = "black")
  addLegend(m, position = "topright", labels = c("High", "Medium","Low"), colors = c("#00BFFF", "black","#FF1493"),
            title = "Grade qualification")

Categorization of variables

# Certain variables are transformed into factors to represent them correctly.

Houses$waterfront <- as.factor(Houses$waterfront)

Houses$view <- as.factor(Houses$view)

Houses$condition <- factor(Houses$condition)

Houses$floors <- as.factor(Houses$floors)

Feature extraction

# A new variable is created using already existent ones.

Houses$age <- 2015 - apply(dplyr::select(Houses, yr_built, yr_renovated), FUN = max, MARGIN = 1) # This new variable represents the years passed with no modifications done to the house.

Splitting of the data

# The data is separated into a training dataset and a testing dataset. 

set.seed(18) # This line is for reproducibility

sample <- sample.split(Houses, SplitRatio = 0.8)
Houses_tr <- subset(Houses, sample == TRUE)
Houses_te <- subset(Houses, sample == FALSE)

Treatment of missing values (after splitting)

# We check if there is NA values.

sample3 <- is.na(Houses_tr)
table(sample3)
## sample3
##  FALSE 
## 367422

We observe no NA values defined as such.

We’ll study if variables’ values are coherent.

# Search of missing values on numerical variables

cat('\n', "Bedrooms")
## 
##  Bedrooms
table(Houses_tr[,"bedrooms"] == 0)
## 
## FALSE  TRUE 
## 16689    12
cat('\n', "Bathrooms")
## 
##  Bathrooms
table(Houses_tr[,"bathrooms"] == 0)
## 
## FALSE  TRUE 
## 16692     9
cat('\n', "Sqm_living")
## 
##  Sqm_living
table(Houses_tr[,"sqm_living"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Sqm_lot")
## 
##  Sqm_lot
table(Houses_tr[,"sqm_lot"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Floors")
## 
##  Floors
table(Houses_tr[,"floors"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Sqm_above")
## 
##  Sqm_above
table(Houses_tr[,"sqm_above"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Sqm_basement")
## 
##  Sqm_basement
table(Houses_tr[,"sqm_basement"] == 0)
## 
## FALSE  TRUE 
##  6535 10166
cat('\n', "Yr_built")
## 
##  Yr_built
table(Houses_tr[,"yr_built"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Yr_renovated")
## 
##  Yr_renovated
table(Houses_tr[,"yr_renovated"] == 0)
## 
## FALSE  TRUE 
##   722 15979
cat('\n', "Zipcode")
## 
##  Zipcode
table(Houses_tr[,"zipcode"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Lat")
## 
##  Lat
table(Houses_tr[,"lat"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Long")
## 
##  Long
table(Houses_tr[,"long"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Sqm_living15")
## 
##  Sqm_living15
table(Houses_tr[,"sqm_living15"] == 0)
## 
## FALSE 
## 16701
cat('\n', "Sqm_lot15")
## 
##  Sqm_lot15
table(Houses_tr[,"sqm_lot15"] == 0)
## 
## FALSE 
## 16701

We observe houses containing \(0\) bedrooms and/or bathrooms, which initially does no make sense.

# We check the houses with a incoherent value for bathrooms and/or bedrooms.

sample1 <- Houses_tr$bedrooms == 0
(House_no_bed <- subset(Houses_tr, sample1 == TRUE))
##       attribute_0         id   price bedrooms bathrooms sqm_living     sqm_lot
## 876           875 6306400140 1095000        0      0.00  284.65491   442.59008
## 3120         3119 3918400017  380000        0      0.00  136.56747    90.95208
## 3468         3467 1453602309  288000        0      1.50  132.85135   153.29002
## 4869         4868 6896300380  228000        0      1.00   36.23219   548.12794
## 6995         6994 2954400190 1295650        0      0.00  446.86362  2602.02834
## 8478         8477 2569500210  339950        0      2.50  212.74796   772.86039
## 9774         9773 3374500520  355000        0      0.00  228.54148   747.77657
## 9855         9854 7849202190  235000        0      0.00  136.56747   445.93459
## 12654       12653 7849202299  320000        0      2.50  138.42553   660.63352
## 14424       14423 9543000205  139950        0      0.00   78.41017   396.60308
## 18380       18379 1222029077  265000        0      0.75   35.67477 19829.59647
## 19453       19452 3980300371  142000        0      0.00   26.94188  1939.35096
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 876      3.5          0    2         3     7 284.65491            0     1990
## 3120       3          0    2         3     8 136.56747            0     2006
## 3468       3          0    0         3     7 132.85135            0     1999
## 4869       1          0    0         2     4  36.23219            0     1953
## 6995       2          0    0         3    12 446.86362            0     1990
## 8478       2          0    0         3     8 212.74796            0     1985
## 9774       2          0    0         3     8 228.54148            0     1990
## 9855       2          0    0         3     7 136.56747            0     1996
## 12654      2          0    0         3     7 138.42553            0     1999
## 14424      1          0    0         4     7  78.41017            0     1913
## 18380      1          0    0         3     4  35.67477            0     2003
## 19453      1          0    0         1     1  26.94188            0     1963
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 876              0   98102 47.6362 -122.322    219.25117  403.1063  25
## 3120             0   98133 47.7145 -122.356    136.56747  156.7274   9
## 3468             0   98125 47.7222 -122.290    132.85135  211.3544  16
## 4869             0   98118 47.5260 -122.261    201.59960  536.4222  62
## 6995             0   98053 47.6642 -122.069    440.36041  355.9115  25
## 8478             0   98042 47.3473 -122.151    232.25760  728.7314  30
## 9774             0   98031 47.4095 -122.168    234.11566  675.1264  25
## 9855             0   98065 47.5265 -121.828     98.47722  610.3730  19
## 12654            0   98065 47.5261 -121.826    139.35456  450.9514  16
## 14424            0   98001 47.2781 -122.250    128.20620  781.9649 102
## 18380            0   98070 47.4177 -122.491    178.37384  284.5620  12
## 19453            0   98024 47.5308 -121.888    150.50292  287.4420  52
sample2 <- Houses_tr$bathrooms == 0
(House_no_bath <- subset(Houses_tr, sample2 == TRUE))
##       attribute_0         id   price bedrooms bathrooms sqm_living    sqm_lot
## 876           875 6306400140 1095000        0         0  284.65491  442.59008
## 1150         1149 3421079032   75000        1         0   62.24504 4029.85517
## 3120         3119 3918400017  380000        0         0  136.56747   90.95208
## 6995         6994 2954400190 1295650        0         0  446.86362 2602.02834
## 9774         9773 3374500520  355000        0         0  228.54148  747.77657
## 9855         9854 7849202190  235000        0         0  136.56747  445.93459
## 10482       10481  203100435  484000        1         0   64.10310 2159.43826
## 14424       14423 9543000205  139950        0         0   78.41017  396.60308
## 19453       19452 3980300371  142000        0         0   26.94188 1939.35096
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 876      3.5          0    2         3     7 284.65491            0     1990
## 1150       1          0    0         3     3  62.24504            0     1966
## 3120       3          0    2         3     8 136.56747            0     2006
## 6995       2          0    0         3    12 446.86362            0     1990
## 9774       2          0    0         3     8 228.54148            0     1990
## 9855       2          0    0         3     7 136.56747            0     1996
## 10482      1          0    0         4     7  64.10310            0     1948
## 14424      1          0    0         4     7  78.41017            0     1913
## 19453      1          0    0         1     1  26.94188            0     1963
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 876              0   98102 47.6362 -122.322    219.25117  403.1063  25
## 1150             0   98022 47.2638 -121.906    107.76753  425.3101  49
## 3120             0   98133 47.7145 -122.356    136.56747  156.7274   9
## 6995             0   98053 47.6642 -122.069    440.36041  355.9115  25
## 9774             0   98031 47.4095 -122.168    234.11566  675.1264  25
## 9855             0   98065 47.5265 -121.828     98.47722  610.3730  19
## 10482            0   98053 47.6429 -121.955    157.00614  248.3298  67
## 14424            0   98001 47.2781 -122.250    128.20620  781.9649 102
## 19453            0   98024 47.5308 -121.888    150.50292  287.4420  52

As no NANs are found, we transform the zero values of bedrooms and bathrooms into NANs. By this, we’ll be able to impute them a value based on their neighbours

Houses_tr$bedrooms[Houses_tr$bedrooms == 0] <- NA
Houses_tr$bathrooms[Houses_tr$bathrooms == 0] <- NA
summary(Houses_tr)
##   attribute_0          id                price            bedrooms     
##  Min.   :    1   Min.   :1.000e+06   Min.   :  75000   Min.   : 1.000  
##  1st Qu.: 5403   1st Qu.:2.120e+09   1st Qu.: 322500   1st Qu.: 3.000  
##  Median :10807   Median :3.905e+09   Median : 450000   Median : 3.000  
##  Mean   :10806   Mean   :4.575e+09   Mean   : 538594   Mean   : 3.371  
##  3rd Qu.:16209   3rd Qu.:7.300e+09   3rd Qu.: 642000   3rd Qu.: 4.000  
##  Max.   :21612   Max.   :9.900e+09   Max.   :7062500   Max.   :33.000  
##                                                        NA's   :12      
##    bathrooms       sqm_living         sqm_lot          floors     waterfront
##  Min.   :0.500   Min.   :  26.94   Min.   :    48.31   1  :8244   0:16572   
##  1st Qu.:1.750   1st Qu.: 131.92   1st Qu.:   467.30   1.5:1504   1:  129   
##  Median :2.250   Median : 177.44   Median :   706.06   2  :6351             
##  Mean   :2.113   Mean   : 192.79   Mean   :  1390.08   2.5: 123             
##  3rd Qu.:2.500   3rd Qu.: 236.90   3rd Qu.:   992.39   3  : 474             
##  Max.   :8.000   Max.   :1257.91   Max.   :153416.27   3.5:   5             
##  NA's   :9                                                                  
##  view      condition     grade          sqm_above       sqm_basement   
##  0:15063   1:   22   Min.   : 1.000   Min.   : 26.94   Min.   :  0.00  
##  1:  254   2:  141   1st Qu.: 7.000   1st Qu.:110.55   1st Qu.:  0.00  
##  2:  741   3:10798   Median : 7.000   Median :144.93   Median :  0.00  
##  3:  387   4: 4406   Mean   : 7.649   Mean   :165.91   Mean   : 26.87  
##  4:  256   5: 1334   3rd Qu.: 8.000   3rd Qu.:204.39   3rd Qu.: 52.03  
##                      Max.   :13.000   Max.   :874.22   Max.   :383.69  
##                                                                        
##     yr_built     yr_renovated        zipcode           lat       
##  Min.   :1900   Min.   :   0.00   Min.   :98001   Min.   :47.16  
##  1st Qu.:1951   1st Qu.:   0.00   1st Qu.:98033   1st Qu.:47.47  
##  Median :1975   Median :   0.00   Median :98065   Median :47.57  
##  Mean   :1971   Mean   :  86.29   Mean   :98078   Mean   :47.56  
##  3rd Qu.:1997   3rd Qu.:   0.00   3rd Qu.:98118   3rd Qu.:47.68  
##  Max.   :2015   Max.   :2015.00   Max.   :98199   Max.   :47.78  
##                                                                  
##       long         sqm_living15      sqm_lot15             age       
##  Min.   :-122.5   Min.   : 37.07   Min.   :  0.0929   Min.   :  0.0  
##  1st Qu.:-122.3   1st Qu.:137.50   1st Qu.:294.2239   1st Qu.: 16.0  
##  Median :-122.2   Median :170.94   Median :481.7023   Median : 38.0  
##  Mean   :-122.2   Mean   :184.47   Mean   :455.2054   Mean   : 41.8  
##  3rd Qu.:-122.1   3rd Qu.:220.18   3rd Qu.:641.4955   3rd Qu.: 61.0  
##  Max.   :-121.3   Max.   :537.91   Max.   :807.2345   Max.   :115.0  
## 
Bath <- data.frame(
  atr <- which(is.na(Houses_tr$bathrooms)),
  s <- Houses_tr$sqm_living[atr],
  Floors <- Houses_tr$floors[atr]
)  

ggplot(Bath, aes(x = atr, y = s, color = Floors)) + 
  geom_point(show.legend = TRUE, shape=18, size=3) +
  xlab('attribute_0') +
  ylab('sqm_living')  +
  labs(title = "MISSING VALUES OF BATHROOMS") + 
  scale_color_manual(values=c('#030303','#FF4500','#4EEE94', '#FFE7BA')) +
  theme_minimal()+theme(panel.background = element_rect(fill = "gray"),
                        legend.position="bottom")

Bed <- data.frame(
  atr <- which(is.na(Houses_tr$bedrooms)),
  s <- Houses_tr$sqm_living[atr],
  Floors <- Houses_tr$floors[atr]
)  

ggplot(Bed, aes(x = atr, y = s, color = Floors)) + 
  geom_point(show.legend = TRUE, shape=18, size=3) +
  xlab('attribute_0') +
  ylab('sqm_living')  +
  labs(title = "MISSING VALUES OF BEDROOMS") + 
  scale_color_manual(values=c('#030303','#FF4500','#4EEE94', '#FFE7BA')) +
  theme_minimal()+theme(panel.background = element_rect(fill = "gray"),
                        legend.position="bottom")

Imputation of new values to the missing values

# New values wil be assign to the missing values through the K-nn algorithm

names_columns <- colnames(Houses_tr)
var_to_use_knn <- names_columns[names_columns != c("attribute_0", "id")] # We don't want certain variables to be used for knn method

houses_knn <- kNN(Houses_tr, variable = c("bedrooms", "bathrooms"), dist_var = var_to_use_knn, k = 129, imp_var = FALSE) #imp_var avoids the creation of variables showing imputation status

Houses_tr <- houses_knn
summary(Houses_tr)
##   attribute_0          id                price            bedrooms    
##  Min.   :    1   Min.   :1.000e+06   Min.   :  75000   Min.   : 1.00  
##  1st Qu.: 5403   1st Qu.:2.120e+09   1st Qu.: 322500   1st Qu.: 3.00  
##  Median :10807   Median :3.905e+09   Median : 450000   Median : 3.00  
##  Mean   :10806   Mean   :4.575e+09   Mean   : 538594   Mean   : 3.37  
##  3rd Qu.:16209   3rd Qu.:7.300e+09   3rd Qu.: 642000   3rd Qu.: 4.00  
##  Max.   :21612   Max.   :9.900e+09   Max.   :7062500   Max.   :33.00  
##    bathrooms       sqm_living         sqm_lot          floors     waterfront
##  Min.   :0.500   Min.   :  26.94   Min.   :    48.31   1  :8244   0:16572   
##  1st Qu.:1.750   1st Qu.: 131.92   1st Qu.:   467.30   1.5:1504   1:  129   
##  Median :2.250   Median : 177.44   Median :   706.06   2  :6351             
##  Mean   :2.113   Mean   : 192.79   Mean   :  1390.08   2.5: 123             
##  3rd Qu.:2.500   3rd Qu.: 236.90   3rd Qu.:   992.39   3  : 474             
##  Max.   :8.000   Max.   :1257.91   Max.   :153416.27   3.5:   5             
##  view      condition     grade          sqm_above       sqm_basement   
##  0:15063   1:   22   Min.   : 1.000   Min.   : 26.94   Min.   :  0.00  
##  1:  254   2:  141   1st Qu.: 7.000   1st Qu.:110.55   1st Qu.:  0.00  
##  2:  741   3:10798   Median : 7.000   Median :144.93   Median :  0.00  
##  3:  387   4: 4406   Mean   : 7.649   Mean   :165.91   Mean   : 26.87  
##  4:  256   5: 1334   3rd Qu.: 8.000   3rd Qu.:204.39   3rd Qu.: 52.03  
##                      Max.   :13.000   Max.   :874.22   Max.   :383.69  
##     yr_built     yr_renovated        zipcode           lat       
##  Min.   :1900   Min.   :   0.00   Min.   :98001   Min.   :47.16  
##  1st Qu.:1951   1st Qu.:   0.00   1st Qu.:98033   1st Qu.:47.47  
##  Median :1975   Median :   0.00   Median :98065   Median :47.57  
##  Mean   :1971   Mean   :  86.29   Mean   :98078   Mean   :47.56  
##  3rd Qu.:1997   3rd Qu.:   0.00   3rd Qu.:98118   3rd Qu.:47.68  
##  Max.   :2015   Max.   :2015.00   Max.   :98199   Max.   :47.78  
##       long         sqm_living15      sqm_lot15             age       
##  Min.   :-122.5   Min.   : 37.07   Min.   :  0.0929   Min.   :  0.0  
##  1st Qu.:-122.3   1st Qu.:137.50   1st Qu.:294.2239   1st Qu.: 16.0  
##  Median :-122.2   Median :170.94   Median :481.7023   Median : 38.0  
##  Mean   :-122.2   Mean   :184.47   Mean   :455.2054   Mean   : 41.8  
##  3rd Qu.:-122.1   3rd Qu.:220.18   3rd Qu.:641.4955   3rd Qu.: 61.0  
##  Max.   :-121.3   Max.   :537.91   Max.   :807.2345   Max.   :115.0
# We choose k = 129 as it is sqrt(SIZE) of our training data. 

Identification and treatment of outliers

# We plot the values for each variable in order to visually find outliers.

# Price
ggplot(Houses_tr, aes(attribute_0, price)) + 
  geom_point(color = "#AB82FF") +
  ylab("price ($)") + 
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$price, decreasing = TRUE),])
##      attribute_0         id   price bedrooms bathrooms sqm_living  sqm_lot
## 3025        3914 9808700762 7062500        5      4.50   932.7465 3467.606
## 1017        1315 7558700030 5300000        6      6.00   686.5535 2306.690
## 900         1164 1247600105 5110800        5      5.25   744.1534 4228.668
## 2030        2626 7738500731 4500000        5      5.50   616.8762 3717.422
## 9559       12370 6065300370 4208000        5      6.00   691.1986 2001.131
## 3207        4149 6447300265 4000000        4      5.50   657.7535 1539.682
##      floors waterfront view condition grade sqm_above sqm_basement yr_built
## 3025      2          1    2         3    11  713.4953    219.25117     1940
## 1017      2          1    4         4    12  464.5152    222.03827     1991
## 900       2          1    4         3    12  556.4892    187.66414     1999
## 2030      2          1    4         3    12  589.9343     26.94188     2004
## 9559      2          0    0         3    12  515.6119    175.58675     2003
## 3207      2          0    0         3    12  535.1215    122.63201     2008
##      yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 3025         2001   98004 47.6500 -122.214     365.1089  302.4923  14
## 1017            0   98040 47.5631 -122.210     401.3411  297.3826  24
## 900             0   98033 47.6767 -122.211     318.6574  310.8536  16
## 2030            0   98155 47.7493 -122.280     281.4962  291.1581  11
## 9559            0   98006 47.5692 -122.189     440.3604  248.4227  12
## 3207            0   98039 47.6151 -122.224     291.7155  202.4357   7
# Bedrooms
ggplot(Houses_tr, aes(attribute_0, bedrooms)) + 
  geom_point(color = "#AB82FF")  +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$bedrooms, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot
## 12264       15870 2402100895  640000       33      1.75   150.5029  557.4182
## 6767         8757 1773100755  520000       11      3.00   278.7091  460.7991
## 10288       13314  627300145 1148000       10      5.25   426.4250 1014.5012
## 11715       15161 5566100170  650000       10      2.00   335.3800 1106.8468
## 14878       19254 8812401450  660000       10      3.00   271.2769  347.9219
## 3165         4096 1997200215  599999        9      4.50   355.8186  649.2064
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 12264      1          0    0         5     7  96.61916     53.88376     1947
## 6767       2          0    0         3     7 222.96730     55.74182     1918
## 10288      1          0    2         3     9 232.25760    194.16735     2008
## 11715      2          0    0         4     7 279.63815     55.74182     1958
## 14878      2          0    0         4     7 172.79965     98.47722     1913
## 3165     2.5          0    0         3     7 227.61245    128.20620     1938
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 12264            0   98103 47.6878 -122.331     123.5610 452.90232  68
## 6767          1999   98106 47.5560 -122.363     131.9223 468.23132  16
## 10288            0   98004 47.5861 -122.113     253.6253  24.06189   7
## 11715            0   98006 47.5705 -122.175     189.5222  92.25272  57
## 14878            0   98105 47.6635 -122.320     168.1545 382.76052 102
## 3165             0   98103 47.6927 -122.338     135.6384 555.37437  77
# Bathrooms
ggplot(Houses_tr, aes(attribute_0, bathrooms)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black")  +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$bathrooms, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living    sqm_lot
## 9874        12777 1225069038 2280000        7      8.00  1257.9072 28591.0964
## 6605         8546  424049043  450000        9      7.50   376.2573   604.2414
## 3110         4024 9175600025  800000        7      6.75   694.9147  3870.7123
## 15902       20578  424069279 1180000        6      6.50   581.5730  1017.7528
## 16619       21506 2524069097 2238890        5      6.50   675.4051 12078.9746
## 14143       18302 6072800246 3300000        5      6.25   745.0824  2019.5263
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874       3          0    4         3    12  874.2176    383.68956     1999
## 6605       2          0    0         3     7  376.2573      0.00000     1996
## 3110       2          0    2         3    11  471.9474    222.96730     1953
## 15902      2          0    0         3    11  449.6507    131.92232     2007
## 16619      2          0    0         3    12  596.4375     78.96758     2010
## 14143      2          0    0         3    11  745.0824      0.00000     2001
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 9874             0   98053 47.6675 -121.986     450.5797  277.1298  16
## 6605             0   98144 47.5923 -122.301     134.5236  393.3515  19
## 3110             0   98166 47.4643 -122.368     261.0575  343.3696  62
## 15902            0   98075 47.5947 -122.039     251.7672  116.5004   8
## 16619            0   98027 47.5371 -121.982     167.2255  438.3165   5
## 14143            0   98006 47.5675 -122.189     386.4766  244.9853  14
# Sqm_living
ggplot(Houses_tr, aes(attribute_0, sqm_living)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$sqm_living, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot
## 9874        12777 1225069038 2280000        7      8.00  1257.9072 28591.096
## 3025         3914 9808700762 7062500        5      4.50   932.7465  3467.606
## 14143       18302 6072800246 3300000        5      6.25   745.0824  2019.526
## 900          1164 1247600105 5110800        5      5.25   744.1534  4228.668
## 10364       13411 2426039123 2415000        5      4.75   732.0760  2252.899
## 12962       16773 1630700380 1920000        5      5.75   718.1405 21448.339
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874       3          0    4         3    12  874.2176    383.68956     1999
## 3025       2          1    2         3    11  713.4953    219.25117     1940
## 14143      2          0    0         3    11  745.0824      0.00000     2001
## 900        2          1    4         3    12  556.4892    187.66414     1999
## 10364      2          0    2         3    13  732.0760      0.00000     1996
## 12962      2          0    0         3    12  618.7342     99.40625     2004
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 9874             0   98053 47.6675 -121.986     450.5797 277.12977  16
## 3025          2001   98004 47.6500 -122.214     365.1089 302.49230  14
## 14143            0   98006 47.5675 -122.189     386.4766 244.98532  14
## 900              0   98033 47.6767 -122.211     318.6574 310.85357  16
## 10364            0   98177 47.7334 -122.362     254.5543  43.10701  19
## 12962            0   98077 47.7615 -122.084     247.1221 397.62501  11
# Sqm_lot
ggplot(Houses_tr, aes(attribute_0, sqm_lot)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$sqm_lot, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot
## 1328         1719 1020069017  700000        4      1.00  120.77395 153416.27
## 13383       17319 3326079016  190000        2      1.00   65.96116 108212.90
## 5910         7647 2623069031  542500        5      3.25  279.63815  99798.12
## 6003         7769 2323089009  855000        4      3.50  374.39925  95139.03
## 3432         4441 3626079040  790000        2      3.00  237.83178  91256.61
## 5170         6691 2624089007 1998000        2      2.50  362.32186  85510.09
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 1328       1          0    3         4     6 120.77395      0.00000     1920
## 13383      1          0    0         2     5  65.96116      0.00000     1915
## 5910     1.5          0    0         5     8 186.73511     92.90304     1931
## 6003       2          0    0         3    10 374.39925      0.00000     2006
## 3432       1          0    0         3     8 237.83178      0.00000     2004
## 5170       2          0    0         3    12 362.32186      0.00000     2009
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 1328             0   98022 47.2313 -122.023     237.8318 422.52303  95
## 13383            0   98014 47.6888 -121.909     156.0771 214.79183 100
## 5910             0   98027 47.4564 -122.004     227.6124 591.04914  84
## 6003             0   98045 47.4619 -121.744     170.0126  83.89145   9
## 3432             0   98014 47.6955 -121.861     150.5029 411.09595  11
## 5170             0   98065 47.5371 -121.756     252.6963 413.04692   6
# Sqm_above
ggplot(Houses_tr, aes(attribute_0, sqm_above)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$sqm_above, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot
## 9874        12777 1225069038 2280000        7      8.00  1257.9072 28591.096
## 14143       18302 6072800246 3300000        5      6.25   745.0824  2019.526
## 10364       13411 2426039123 2415000        5      4.75   732.0760  2252.899
## 3025         3914 9808700762 7062500        5      4.50   932.7465  3467.606
## 9174        11871 8835800350 1950000        4      3.25   689.3406 15595.540
## 14368       18594 3023069166 1135250        5      4.00   680.0503 20234.282
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874       3          0    4         3    12  874.2176     383.6896     1999
## 14143      2          0    0         3    11  745.0824       0.0000     2001
## 10364      2          0    2         3    13  732.0760       0.0000     1996
## 3025       2          1    2         3    11  713.4953     219.2512     1940
## 9174       2          0    3         3    12  689.3406       0.0000     2002
## 14368      2          0    0         3    11  680.0503       0.0000     1992
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 9874             0   98053 47.6675 -121.986     450.5797 277.12977  16
## 14143            0   98006 47.5675 -122.189     386.4766 244.98532  14
## 10364            0   98177 47.7334 -122.362     254.5543  43.10701  19
## 3025          2001   98004 47.6500 -122.214     365.1089 302.49230  14
## 9174             0   98045 47.4548 -121.764     521.1861 217.95053  13
## 14368            0   98058 47.4473 -122.086     303.7929 350.80188  23
# Sqm_basement
ggplot(Houses_tr, aes(attribute_0, sqm_basement)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$sqm_basement, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living   sqm_lot
## 9874        12777 1225069038 2280000        7      8.00  1257.9072 28591.096
## 11964       15482  624069108 3200000        4      3.25   650.3213  2620.423
## 7794        10085 7767000060 1900000        5      4.25   604.7988  1530.206
## 16493       21344 8835770170 1488000        5      6.00   639.1729 26009.878
## 5437         7035  853200010 3800000        5      5.50   654.9664  3979.966
## 5122         6628 3322049005  850000        4      2.75   505.3925 22257.710
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 9874       3          0    4         3    12  874.2176     383.6896     1999
## 11964      1          1    4         4    12  325.1606     325.1606     1991
## 7794       2          0    3         4    11  301.9349     302.8639     1980
## 16493      2          0    3         3    12  378.1154     261.0575     2007
## 5437       1          0    2         4    13  401.3411     253.6253     1978
## 5122       1          0    0         2     9  252.6963     252.6963     1969
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 9874             0   98053 47.6675 -121.986     450.5797  277.1298  16
## 11964            0   98075 47.5928 -122.086     456.4326  173.9145  24
## 7794             0   98040 47.5758 -122.242     416.2056  210.9828  35
## 16493            0   98045 47.4624 -121.779     435.7153  304.2575   8
## 5437             0   98004 47.6229 -122.220     471.0184  264.6808  37
## 5122             0   98001 47.3540 -122.293     183.0190  405.8934  46
# Yr_built
ggplot(Houses_tr, aes(attribute_0, age)) + 
  geom_point(color = "#AB82FF") +
  geom_point(data = Houses_tr[which(Houses_tr$bedrooms == 33),], colour = "black") +
  geom_point(data = Houses_tr[which(Houses_tr$sqm_living > 1000),], colour = "red")

head(Houses_tr[order(Houses_tr$yr_built, decreasing = TRUE),])
##       attribute_0         id   price bedrooms bathrooms sqm_living  sqm_lot
## 497           643 9385200045  729500        3      2.50  154.21905 101.3572
## 1362         1763 1832100030  597326        4      4.00  331.66385 766.4501
## 2076         2687 3076500830  385195        1      1.00   65.96116 557.4182
## 6213         8039 1250200495  455000        2      1.50  111.48365 116.9649
## 6511         8425  558100090  628000        5      2.75  241.54790 758.0888
## 10984       14215 8156600210 1285000        5      3.50  276.85106 473.8055
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 497        3          0    1         3     9 142.14165     12.07740     2015
## 1362       2          0    0         3    10 265.70269     65.96116     2015
## 2076     1.5          0    0         3     6  65.96116      0.00000     2015
## 6213       2          0    0         3     8  92.90304     18.58061     2015
## 6511       2          0    0         3     8 241.54790      0.00000     2015
## 10984      2          0    0         3    10 220.18020     56.67085     2015
##       yr_renovated zipcode     lat     long sqm_living15    sqm_lot15 age
## 497              0   98116 47.5818 -122.402     140.2836 144.92874240   0
## 1362             0   98040 47.5784 -122.226     207.1738   0.09290304   0
## 2076             0   98144 47.5756 -122.316     133.7804 458.19779328   0
## 6213             0   98144 47.6001 -122.298     122.6320 239.41113408   0
## 6511             0   98133 47.7348 -122.340     148.6449 684.41669568   0
## 10984            0   98115 47.6782 -122.299     165.3674 477.89323776   0

House \('1225069038'\) has a very large dimension on sqm_living, was built in 1999 but is considerably cheaper than other houses. We will delete this individual.

# Elimination of outlier. 

Houses_tr <- Houses_tr[!Houses_tr$id == 1225069038,]
if(! any(Houses_tr$id == 1225069038)){
  print("Deleted successfully")
}else{ 
  print("Error!")
}
## [1] "Deleted successfully"
# Manual imputation of bedrooms due to human error.
Houses_tr$bedrooms[Houses_tr$id == 2402100895] <- 3
(Houses_tr[Houses_tr$id == 2402100895,])
##       attribute_0         id  price bedrooms bathrooms sqm_living  sqm_lot
## 12264       15870 2402100895 640000        3      1.75   150.5029 557.4182
##       floors waterfront view condition grade sqm_above sqm_basement yr_built
## 12264      1          0    0         5     7  96.61916     53.88376     1947
##       yr_renovated zipcode     lat     long sqm_living15 sqm_lot15 age
## 12264            0   98103 47.6878 -122.331      123.561  452.9023  68
# We search for outliers using z score.

# We create a separate dataset to work on.
Houses_z <- Houses_tr

Houses_z$price_z_score <- abs(scale(Houses_z$price, center = TRUE, scale = TRUE))
Houses_z$bedrooms_z_score <- abs(scale(Houses_z$bedrooms, center = TRUE, scale = TRUE))
Houses_z$bathrooms_z_score <-  abs(scale(Houses_z$bathrooms, center = TRUE, scale = TRUE))
Houses_z$living_z_score <- abs(scale(Houses_z$sqm_living, center = TRUE, scale = TRUE))
Houses_z$lot_z_score <-  abs(scale(Houses_z$sqm_lot, center = TRUE, scale = TRUE))
Houses_z$above_z_score <-  abs(scale(Houses_z$sqm_above, center = TRUE, scale = TRUE))
Houses_z$age_z_score <-  abs(scale(Houses_z$age, center = TRUE, scale = TRUE))

collist <- c(23, 24, 25, 26, 27, 28, 29)

num <- vector()

for (row in 1:length(Houses_z[,1])) 
  { 
  sum <- 0
  for (element in collist) 
  {
    if (Houses_z[row,element] > 3){ # If the z_score value is higher than 3, we will count that as an outlier. 
      sum <- sum + 1
    }
  }
  num <- c(num, sum)
}

Houses_z$num <- num

# We will consider outliers those houses with a number higher of 3.
outlier_id <- Houses_z$id[Houses_z$num > 3]
Houses_tr <- Houses_tr[!(Houses_tr$id %in% outlier_id),]

Gaussianity

# We transform the data for price in order to obtain gaussianity. 

h <- hist(Houses_tr$price, xlab = "price ($)", main = "Histogram of price", 
     breaks = 25, ylim = c(0, 6000))
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$price, col = "#87CEFA", breaks = 25, ylim = c(0, 6000),  add = TRUE)
text(h$mids, h$counts, labels = h$counts, adj=c(0.5, -0.5), cex=0.55)

h <- hist(log(Houses_tr$price), xlab = "log(price)" , main = "Histogram of log(price)", 
     ylim = c(0,1), breaks = 15, prob = TRUE, col = "#87CEFA")
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$price, ylim = c(0,1), col = "#87CEFA",  add = TRUE,  breaks = 15, prob = TRUE)
curve(dnorm(x, mean(log(Houses_tr$price)), sd(log(Houses_tr$price))), col = "#EE6AA7", add = TRUE, lwd = 1.75)

Houses_tr$price = (log(Houses_tr$price)) # Apply it for the training set

Houses_te$price = (log(Houses_te$price)) # Apply it for the test set
# We transform the data for sqm_living in order to obtain gaussianity.

h <- hist(Houses_tr$sqm_living, xlab = "Squared metres of living", main = "Histogram of sqm_living", 
     breaks = 25, ylim = c(0, 6000))
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(Houses_tr$sqm_living, col = "#87CEFA", breaks = 25, ylim = c(0, 6000),  add = TRUE)
text(h$mids, h$counts, labels = h$counts, adj=c(0.5, -0.5), cex=0.55)

h <- hist(log(Houses_tr$sqm_living), xlab = "log(Squared metres of living)", main = "Histogram of log(sqm_living)", 
     xlim = c(3,8), ylim = c(0,1), breaks = 15, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(log(Houses_tr$sqm_living), ylim = c(0,1), col = "#87CEFA",  add = TRUE,  breaks = 15, prob = TRUE)
curve(dnorm(x, mean(log(Houses_tr$sqm_living)), sd(log(Houses_tr$sqm_living))), col = "#EE6AA7", add = TRUE, lwd = 1.75)

Houses_tr$sqm_living = (log(Houses_tr$sqm_living)) # Apply it for the training set.

Houses_te$sqm_living = (log(Houses_te$sqm_living)) # Apply it for the test set.
# We transform the data for sqm_lot in order to obtain gaussianity. 

hist(Houses_tr$sqm_lot, main="Squared metres of lot")

h <- hist(log(Houses_tr$sqm_lot), xlab = "sqm_lot_transf", main = "Transformed squared metres of lot", 
      ylim = c(0,5), breaks = 30, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist(log(Houses_tr$sqm_lot), ylim = c(0,5), col = "#87CEFA",  add = TRUE,  breaks = 30, prob = TRUE, xlab = "sqm_lot")
curve(dnorm(x, mean(log(Houses_tr$sqm_lot)), sd(log(Houses_tr$sqm_lot))), col = "#EE6AA7", add = TRUE, lwd = 1.75)

Houses_tr$sqm_lot = (log(Houses_tr$sqm_lot)) # Apply it for the training set.

Houses_te$sqm_lot = (log(Houses_te$sqm_lot)) # Apply it for the test set.
# We transform the data for sqm_above in order to obtain gaussianity. 

hist(Houses_tr$sqm_above, main="Squared metres above")

bx = boxcox(I(sqm_above) ~ . - id - attribute_0, data = Houses_tr,
             lambda = seq(-0.5, 0.5, length = 10))

lambda = bx$x[which.max(bx$y)]

sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 0.095960"
sqm_above_transf = (Houses_tr$sqm_above^lambda - 1)/lambda

h <- hist(sqm_above_transf, xlab = "sqm_above", main = "Squared metres above", 
      ylim = c(0,1), breaks = 30, prob = TRUE)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_above_transf), ylim = c(0,1), col = "#87CEFA",  add = TRUE,  breaks = 30, prob = TRUE)
curve(dnorm(x, mean(sqm_above_transf), sd(sqm_above_transf)), col = "#EE6AA7", add = TRUE, lwd = 1.75)

As it doesn’t correct the skewness totally, we decide not to apply it.

# We transform the data for sqm_above in order to obtain gaussianity. 

hist(Houses_tr$sqm_basement, main="Squared metres basement", breaks = 30, col = "#87CEFA", xlab = "sqm_basement")

It is already Gaussian.

# We transform the data for sqm_living15 in order to obtain gaussianity.

hist(Houses_tr$sqm_living15, main="Squared metres15", breaks = 30)

bx = boxcox(I(Houses_tr$sqm_living15) + 1 ~ . - id - attribute_0, data = Houses_tr,
             lambda = seq(-0.25, 0.25, length = 10)) # if we don't add 1, we would compute negative logarithms

lambda = bx$x[which.max(bx$y)]

sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 0.148990"
sqm_living15_transf = (Houses_tr$sqm_living15^lambda - 1)/lambda


h <- hist(sqm_living15_transf, xlab = "sqm_living15_transf", main = "Squared metres of living15", 
       breaks = 30)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_living15_transf),col = "#87CEFA",  add = TRUE,  breaks = 30)

We don’t apply the Box Cox Transformation.

# We transform the data for sqm_lot15 in order to obtain gaussianity.

hist(Houses_tr$sqm_lot15, main="Squared metres15", breaks = 30)

bx = boxcox(I(Houses_tr$sqm_lot15) + 1 ~ . - id - attribute_0, data = Houses_tr,
             lambda = seq(-1, 1, length = 10)) # if we don't add 1, we would compute negative logarithms

lambda = bx$x[which.max(bx$y)]

sprintf("The value of lambda used is: %f", lambda)
## [1] "The value of lambda used is: 1.000000"
sqm_lot15_transf = (Houses_tr$sqm_lot15^lambda - 1)/lambda

h <- hist(sqm_lot15_transf + 1, xlab = "sqm_lot15_transf", main = "Squared metres of lot15", 
       breaks = 30)
grid(nx = NA, ny = NULL, lty = 2, col = "gray", lwd = 1)
h <- hist((sqm_lot15_transf + 1),col = "#87CEFA",  add = TRUE,  breaks = 30)

We do not apply any transformation.

# We download the processed data to work in python

write.csv(Houses_tr,"Processed_tr.csv", row.names = FALSE)
write.csv(Houses_te,"Processed_te.csv", row.names = FALSE)

Correlation between variables

# Correlation plot of numerical variables

Houses_smth <- subset(Houses_tr, select = - c(id, attribute_0))
ggcorr(Houses_smth, hjust = 0.85,  size = 3, color = "black", type = "upper", layout.exp = 2, 
       label = TRUE, label_size = 2.5, low = "yellow", mid = "orange", high = "red")+
  labs(title = "Correlation Heat-Map between variables") +
  theme(plot.title = element_text(face = "bold", hjust = 0.5)) 
## Warning in ggcorr(Houses_smth, hjust = 0.85, size = 3, color = "black", : data
## in column(s) 'floors', 'waterfront', 'view', 'condition' are not numeric and
## were ignored
## Warning: Ignoring unknown parameters: type